library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust


Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
  method         from
  print.paged_df     
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor
library(gplots)

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess
options(stringsAsFactors = FALSE)

load sample info

sample.description <- read.csv("../input/sample.description.csv")

load reads

lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 26704    48

Filter for genes with the highest coefficient of variation

CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))

names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1 
           0.1026816            0.2265241 
quantile(CV, 0.24)
      24% 
0.1016702 

LFY2 has a pretty low CV; have to include 76% of the genes by CV to include it.

lcpm.filter <- lcpm[CV > quantile(CV, 0.24),]
dim(lcpm.filter)
[1] 20295    48

WGCNA wants genes in columns

lcpm.filter.t <- t(lcpm.filter)

Start by including all samples

Soft thresholding

powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 21000)
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 20295 of 20295
Warning: executing %dopar% sequentially: no parallel backend registered
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

choose 5

softPower <- 5
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

define modules

# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit <- 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize);
 ..done.
table(dynamicMods)
dynamicMods
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
3985 1679 1409 1303 1203 1073  879  852  668  523  494  474  463  432  418  398  394  375  330  324 
  21   22   23   24   25   26   27   28   29   30   31   32   33   34 
 304  301  268  230  224  195  175  171  164  148  135  131  105   68 
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
         black           blue          brown           cyan      darkgreen       darkgrey 
           879           1679           1409            432            301            230 
   darkmagenta darkolivegreen     darkorange        darkred  darkturquoise          green 
            68            105            195            304            268           1203 
   greenyellow         grey60      lightcyan     lightgreen    lightyellow        magenta 
           494            394            398            375            330            668 
  midnightblue         orange  paleturquoise           pink         purple            red 
           418            224            135            852            523           1073 
     royalblue    saddlebrown         salmon        skyblue      steelblue            tan 
           324            164            463            171            148            474 
     turquoise         violet          white         yellow 
          3985            131            175           1303 
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

merge similar modules

# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")

merge with correlation > 0.75 (Change from default of 0.8 / 0.2())

MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
abline(h=MEDissThres, col = "red")

# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
 mergeCloseModules: Merging modules whose distance is less than 0.25
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 34 module eigengenes in given set.
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 26 module eigengenes in given set.
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 25 module eigengenes in given set.
   Calculating new MEs...
   multiSetMEs: Calculating module MEs.
     Working on set 1 ...
     moduleEigengenes: Calculating 25 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs

compare pre and post merge

sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs 
table(merge$colors)

        black          blue         brown          cyan     darkgreen      darkgrey   darkmagenta 
          879          2994          2612           432           406           230            68 
   darkorange       darkred darkturquoise   greenyellow        grey60     lightcyan    lightgreen 
          195           304           492           494           394           398           375 
  lightyellow       magenta  midnightblue paleturquoise        purple   saddlebrown       skyblue 
          330          1741           418           459           523           312           171 
          tan     turquoise        violet         white 
          474          5288           131           175 
length(table(merge$colors))
[1] 25
median(table(merge$colors))
[1] 406

Look at modules

Which module is LFY in?

CrLFY1 <- "Ceric.33G031700" 

CrLFY2 <- "Ceric.18G076300"
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)

module.assignment %>%
  filter(str_detect(geneID, "18G076300|33G031700"))

Interesting: they are in different modules(!).

module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)

Plot eigengenes

Make sure sample info sheet is in the correct order.

rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen %>%
  mutate(gt_tissue=str_c(tissue, "-", base_gt)) %>%
  pivot_longer(starts_with("ME"), names_to = "ME") %>%
  ggplot(aes(x=gt_tissue, y = value, color = tissue)) +
  geom_point() +
  facet_wrap(~ME, ncol=5) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
  scale_color_brewer(type="qual", palette = "Set3")

A heat map:

MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")

save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_allSamples.Rdata")
---
title: "04_WGCNA"
author: "Julin Maloof"
date: "2025-02-16"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(tidyverse)
library(WGCNA)
library(gplots)
options(stringsAsFactors = FALSE)
```

load sample info
```{r}
sample.description <- read.csv("../input/sample.description.csv")
```


load reads

```{r}
lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
```

Filter for genes with the highest coefficient of variation

```{r}
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
```

```{r}
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
```

```{r}
quantile(CV, 0.24)
```

LFY2 has a pretty low CV; have to include 76% of the genes by CV to include it.
```{r}
lcpm.filter <- lcpm[CV > quantile(CV, 0.24),]
dim(lcpm.filter)
```




WGCNA wants genes in columns

```{r}
lcpm.filter.t <- t(lcpm.filter)
```

Start by including all samples

Soft thresholding
```{r}
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 21000)
```
```{r}
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
```
choose 5

```{r}
softPower <- 5
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
dissTOM <- 1-TOM
```

```{r}
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
```

define modules

```{r}
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit <- 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize);
table(dynamicMods)
```

```{r}
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
```

merge similar modules

```{r}
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
```

merge with correlation > 0.75
(Change from default of 0.8 / 0.2())
```{r}
MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
```

compare pre and post merge
```{r}
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
#dev.off()
```

```{r}
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs 
table(merge$colors)
length(table(merge$colors))
median(table(merge$colors))

```

## Look at modules

Which module is LFY in?

```{r}
CrLFY1 <- "Ceric.33G031700" 

CrLFY2 <- "Ceric.18G076300"
```

```{r}
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)

module.assignment %>%
  filter(str_detect(geneID, "18G076300|33G031700"))
```
Interesting: they are in different modules(!).  

```{r}
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
```

Plot eigengenes

Make sure sample info sheet is in the correct order.
```{r}
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
```

```{r}
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
```

```{r, fig.height=6, fig.width=10}
sample.eigen %>%
  mutate(gt_tissue=str_c(tissue, "-", base_gt)) %>%
  pivot_longer(starts_with("ME"), names_to = "ME") %>%
  ggplot(aes(x=gt_tissue, y = value, color = tissue)) +
  geom_point() +
  facet_wrap(~ME, ncol=5) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
  scale_color_brewer(type="qual", palette = "Set3")
```
A heat map:

```{r, fig.height=7}
MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
```


```{r}
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_allSamples.Rdata")
```

